from IPython.core.display import HTML

#notebook-wide CSS styling; the returned HTML object is rendered by IPython.
#Fixes: 'horizontal-align' is not a CSS property (removed; text-align already
#centers horizontally) and 'ariel' is a typo for the 'arial' font family.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
h1,h2 {
    text-align: center;
    background-color: black;
    padding: 20px;
    margin: 0;
    color: yellow;
    font-family: arial;
    border-radius: 80px;
}
h3 {
    text-align: center;
    border-style: solid;
    border-width: 3px;
    padding: 12px;
    margin: 0;
    color: black;
    font-family: arial;
    border-radius: 80px;
    border-color: gold;
}
body, p {
    font-family: arial;
    font-size: 15px;
    color: charcoal;
}
div {
    font-size: 14px;
    margin: 0;
}
h4 {
    padding: 0px;
    margin: 0;
    font-family: arial;
    color: purple;
}
</style>
""")
We invest in customers (acquisition costs, offline ads, promotions, discounts, etc.) to generate revenue and be profitable. Naturally, these actions make some customers super valuable in terms of lifetime value, but there are always some customers who pull down the profitability. We need to identify these behavior patterns, segment customers, and act accordingly.
Calculating Lifetime Value is the easy part. First we need to select a time window. It can be anything like 3, 6, 12, 24 months. By the equation below, we can have Lifetime Value for each customer in that specific time window:
Lifetime Value: Total Gross Revenue - Total Cost
This equation now gives us the historical lifetime value. If we see some customers having very high negative lifetime value historically, it could be too late to take an action. At this point, we need to predict the future with machine learning:
Deciding the time frame really depends on your industry, business model, strategy and more. For some industries, 1 year is a very long period while for the others it is very short. In our example, we will go ahead with 6 months.
RFM scores for each customer ID are the perfect candidates for feature set. To implement it correctly, we need to split our dataset. We will take 3 months of data, calculate RFM and use it for predicting next 6 months. So we need to create two dataframes first and append RFM scores to them.
#import libraries
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
from sklearn.cluster import KMeans
from chart_studio import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb
#initate plotly
pyoff.init_notebook_mode()
#read data from csv and redo the data work we done before
tx_data = pd.read_csv('Online_Retail.csv')
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)
#create 3m and 6m dataframes
tx_3m = tx_uk[(tx_uk.InvoiceDate.dt.date < date(2011,6,1)) & (tx_uk.InvoiceDate.dt.date >= date(2011,3,1))].reset_index(drop=True)
tx_6m = tx_uk[(tx_uk.InvoiceDate.dt.date >= date(2011,6,1)) & (tx_uk.InvoiceDate.dt.date < date(2011,12,1))].reset_index(drop=True)
#create tx_user for assigning clustering
tx_user = pd.DataFrame(tx_3m['CustomerID'].unique())
tx_user.columns = ['CustomerID']
#order cluster method
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel the ids in *cluster_field_name* so they follow the rank of each
    cluster's mean *target_field_name*.

    With ascending=True the cluster with the lowest mean becomes 0 and the
    highest mean gets the largest id (and vice versa for ascending=False).
    Returns a new dataframe; *df* itself is not modified.
    """
    #(removed unused local 'new_cluster_field_name' from the original)
    #mean target value per cluster, ranked in the requested direction
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    #map every row's old cluster id to its rank, then adopt the rank as the new id
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={"index": cluster_field_name})
    return df_final
#calculate recency score: days since each customer's last purchase in the 3m window
tx_max_purchase = tx_3m.groupby('CustomerID').InvoiceDate.max().reset_index()
tx_max_purchase.columns = ['CustomerID','MaxPurchaseDate']
#recency = days between this customer's last purchase and the latest purchase overall
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID','Recency']], on='CustomerID')
#n_init set explicitly to silence the sklearn FutureWarning (default changes to
#'auto' in 1.4); random_state pinned for reproducible cluster assignments
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
#relabel so a higher cluster id means more recent (fewer recency days)
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)
(Note: scikit-learn emits a FutureWarning here — the default value of `n_init` in KMeans changes from 10 to 'auto' in version 1.4; pass `n_init` explicitly to suppress it.)
#calculate frequency score: number of invoice lines per customer in the 3m window
tx_frequency = tx_3m.groupby('CustomerID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['CustomerID','Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')
#n_init explicit to silence the sklearn FutureWarning; random_state for reproducibility
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
#relabel so a higher cluster id means higher purchase frequency
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
(Note: the same scikit-learn `n_init` FutureWarning appears here; set `n_init` explicitly to suppress it.)
#calculate revenue score: total revenue per customer in the 3m window
tx_3m['Revenue'] = tx_3m['UnitPrice'] * tx_3m['Quantity']
tx_revenue = tx_3m.groupby('CustomerID').Revenue.sum().reset_index()
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')
#n_init explicit to silence the sklearn FutureWarning; random_state for reproducibility
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
#relabel so a higher cluster id means higher revenue
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
(Note: the same scikit-learn `n_init` FutureWarning appears here; set `n_init` explicitly to suppress it.)
#overall scoring: sum the three RFM cluster ranks, then bucket into named segments
tx_user['OverallScore'] = (
    tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
)

def _score_to_segment(score):
    #same thresholds as before: 0-2 low, 3-4 mid, 5+ high
    if score > 4:
        return 'High-Value'
    if score > 2:
        return 'Mid-Value'
    return 'Low-Value'

tx_user['Segment'] = tx_user['OverallScore'].apply(_score_to_segment)
tx_user.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 14620.0 | 12 | 3 | 30 | 0 | 393.28 | 0 | 3 | Mid-Value |
| 1 | 15194.0 | 6 | 3 | 64 | 0 | 1439.02 | 0 | 3 | Mid-Value |
| 2 | 18044.0 | 5 | 3 | 57 | 0 | 808.96 | 0 | 3 | Mid-Value |
| 3 | 18075.0 | 12 | 3 | 35 | 0 | 638.12 | 0 | 3 | Mid-Value |
| 4 | 15241.0 | 0 | 3 | 64 | 0 | 947.55 | 0 | 3 | Mid-Value |
Calculate 6 months LTV for each customer which we are going to use for training our model.
There is no cost specified in the dataset. That’s why Revenue becomes our LTV directly.
#calculate revenue and create a new dataframe for it (6m window = the label period)
tx_6m['Revenue'] = tx_6m['UnitPrice'] * tx_6m['Quantity']
tx_user_6m = tx_6m.groupby('CustomerID')['Revenue'].sum().reset_index()
tx_user_6m.columns = ['CustomerID','m6_Revenue']

#plot LTV histogram; values >= 10k are excluded from the chart only
revenue_hist = go.Histogram(
    x=tx_user_6m.query('m6_Revenue < 10000')['m6_Revenue']
)
fig = go.Figure(
    data=[revenue_hist],
    layout=go.Layout(title='6m Revenue'),
)
pyoff.iplot(fig)
Histogram clearly shows we have customers with negative LTV. We have some outliers too. Filtering out the outliers makes sense to have a proper machine learning model.
We will merge our 3 months and 6 months dataframes to see correlations between LTV and the feature set we have.
#attach 6m revenue (LTV) to the 3m feature set; customers with no 6m activity get 0
tx_merge = pd.merge(tx_user, tx_user_6m, on='CustomerID', how='left')
tx_merge = tx_merge.fillna(0)

#cap at 30k for charting only, to keep the scatter readable
tx_graph = tx_merge.query("m6_Revenue < 30000")

#one scatter trace per segment; trace order Low -> Mid -> High is kept
segment_styles = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]
plot_data = []
for segment, label, size, color, opacity in segment_styles:
    subset = tx_graph[tx_graph['Segment'] == segment]
    plot_data.append(
        go.Scatter(
            x=subset['OverallScore'],
            y=subset['m6_Revenue'],
            mode='markers',
            name=label,
            marker=dict(
                size=size,
                line=dict(width=1),
                color=color,
                opacity=opacity,
            ),
        )
    )

plot_layout = go.Layout(
    yaxis={'title': "6m LTV"},
    xaxis={'title': "RFM Score"},
    title='LTV'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Before building the machine learning model, we need to identify what is the type of this machine learning problem. LTV itself is a regression problem. A machine learning model can predict the $ value of the LTV. But here, we want LTV segments. Because it makes it more actionable and easy to communicate with other people. By applying K-means clustering, we can identify our existing LTV groups and build segments on top of it.
Considering business part of this analysis, we need to treat customers differently based on their predicted LTV. For this example, we will apply clustering and have 3 segments (number of segments really depends on your business dynamics and goals):
We are going to apply K-means clustering to decide segments and observe their characteristics:
#remove outliers: drop the top 1% of 6m revenue before clustering
tx_merge = tx_merge[tx_merge['m6_Revenue'] < tx_merge['m6_Revenue'].quantile(0.99)]
#creating 3 clusters; n_init explicit to silence the sklearn FutureWarning,
#random_state pinned for reproducible cluster assignments
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(tx_merge[['m6_Revenue']])
tx_merge['LTVCluster'] = kmeans.predict(tx_merge[['m6_Revenue']])
#order cluster number based on LTV (after ordering, 2 = highest-value cluster)
tx_merge = order_cluster('LTVCluster', 'm6_Revenue', tx_merge, True)
#creating a new cluster dataframe
tx_cluster = tx_merge.copy()
#see details of the clusters
tx_cluster.groupby('LTVCluster')['m6_Revenue'].describe()
(Note: the same scikit-learn `n_init` FutureWarning appears here; set `n_init` explicitly to suppress it.)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| LTVCluster | ||||||||
| 0 | 1394.0 | 396.137189 | 419.891843 | -609.40 | 0.000 | 294.220 | 682.4300 | 1429.87 |
| 1 | 371.0 | 2492.794933 | 937.341566 | 1445.31 | 1731.980 | 2162.930 | 3041.9550 | 5287.39 |
| 2 | 56.0 | 8222.565893 | 2983.572030 | 5396.44 | 6151.435 | 6986.545 | 9607.3225 | 16756.31 |
#inspect the customers that landed in the top LTV cluster (2 = highest after ordering)
tx_cluster.query('LTVCluster == 2')
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | LTVCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1765 | 16843.0 | 5 | 3 | 59 | 0 | 1829.69 | 0 | 3 | Mid-Value | 6354.67 | 2 |
| 1766 | 13199.0 | 5 | 3 | 24 | 0 | 1865.12 | 0 | 3 | Mid-Value | 5421.98 | 2 |
| 1767 | 13777.0 | 14 | 3 | 32 | 0 | 1818.43 | 0 | 3 | Mid-Value | 13517.20 | 2 |
| 1768 | 17306.0 | 12 | 3 | 11 | 0 | 1520.78 | 0 | 3 | Mid-Value | 5421.87 | 2 |
| 1769 | 14895.0 | 12 | 3 | 64 | 0 | 1573.44 | 0 | 3 | Mid-Value | 6220.51 | 2 |
| 1770 | 14849.0 | 14 | 3 | 58 | 0 | 981.13 | 0 | 3 | Mid-Value | 5498.07 | 2 |
| 1771 | 17340.0 | 0 | 3 | 54 | 0 | 1442.42 | 0 | 3 | Mid-Value | 6355.49 | 2 |
| 1772 | 16779.0 | 27 | 2 | 65 | 0 | 1714.61 | 0 | 2 | Low-Value | 5813.88 | 2 |
| 1773 | 13985.0 | 22 | 2 | 33 | 0 | 640.80 | 0 | 2 | Low-Value | 6236.17 | 2 |
| 1774 | 15640.0 | 34 | 2 | 56 | 0 | 1420.58 | 0 | 2 | Low-Value | 5497.95 | 2 |
| 1775 | 16210.0 | 20 | 2 | 21 | 0 | 1716.27 | 0 | 2 | Low-Value | 6893.69 | 2 |
| 1776 | 14607.0 | 69 | 0 | 3 | 0 | 495.00 | 0 | 0 | Low-Value | 10846.10 | 2 |
| 1777 | 13090.0 | 46 | 1 | 25 | 0 | 1315.40 | 0 | 1 | Low-Value | 5396.44 | 2 |
| 1778 | 14062.0 | 46 | 1 | 26 | 0 | 1738.24 | 0 | 1 | Low-Value | 5630.22 | 2 |
| 1779 | 17865.0 | 4 | 3 | 79 | 1 | 1779.43 | 0 | 4 | Mid-Value | 6124.41 | 2 |
| 1780 | 17581.0 | 13 | 3 | 91 | 1 | 1988.83 | 0 | 4 | Mid-Value | 6567.97 | 2 |
| 1781 | 13319.0 | 12 | 3 | 84 | 1 | 1721.95 | 0 | 4 | Mid-Value | 5918.60 | 2 |
| 1782 | 16523.0 | 7 | 3 | 81 | 1 | 2061.80 | 0 | 4 | Mid-Value | 5675.95 | 2 |
| 1783 | 17139.0 | 18 | 2 | 78 | 1 | 2051.32 | 0 | 3 | Mid-Value | 5894.87 | 2 |
| 1784 | 13969.0 | 25 | 2 | 92 | 1 | 844.25 | 0 | 3 | Mid-Value | 6402.41 | 2 |
| 1785 | 14051.0 | 14 | 3 | 46 | 0 | 2574.07 | 1 | 4 | Mid-Value | 7813.71 | 2 |
| 1786 | 17381.0 | 11 | 3 | 48 | 0 | 6638.67 | 1 | 4 | Mid-Value | 6146.14 | 2 |
| 1787 | 12901.0 | 5 | 3 | 51 | 0 | 7566.20 | 1 | 4 | Mid-Value | 7946.90 | 2 |
| 1788 | 17857.0 | 14 | 3 | 11 | 0 | 6583.80 | 1 | 4 | Mid-Value | 15115.60 | 2 |
| 1789 | 14680.0 | 6 | 3 | 66 | 0 | 6226.81 | 1 | 4 | Mid-Value | 12556.39 | 2 |
| 1790 | 17677.0 | 14 | 3 | 58 | 0 | 3490.33 | 1 | 4 | Mid-Value | 9313.18 | 2 |
| 1791 | 15251.0 | 6 | 3 | 48 | 0 | 4045.77 | 1 | 4 | Mid-Value | 6281.34 | 2 |
| 1792 | 16705.0 | 6 | 3 | 50 | 0 | 2982.17 | 1 | 4 | Mid-Value | 7132.03 | 2 |
| 1793 | 13881.0 | 21 | 2 | 38 | 0 | 2544.12 | 1 | 3 | Mid-Value | 12850.94 | 2 |
| 1794 | 13340.0 | 84 | 0 | 51 | 0 | 3678.03 | 1 | 1 | Low-Value | 7058.45 | 2 |
| 1795 | 12971.0 | 4 | 3 | 73 | 1 | 2962.65 | 1 | 5 | High-Value | 6169.64 | 2 |
| 1796 | 16839.0 | 4 | 3 | 73 | 1 | 3729.42 | 1 | 5 | High-Value | 7945.35 | 2 |
| 1797 | 15189.0 | 5 | 3 | 75 | 1 | 3252.04 | 1 | 5 | High-Value | 10062.85 | 2 |
| 1798 | 13767.0 | 7 | 3 | 112 | 1 | 3019.22 | 1 | 5 | High-Value | 8910.04 | 2 |
| 1799 | 17675.0 | 12 | 3 | 128 | 1 | 3812.71 | 1 | 5 | High-Value | 11464.67 | 2 |
| 1800 | 14031.0 | 3 | 3 | 88 | 1 | 2740.10 | 1 | 5 | High-Value | 7459.18 | 2 |
| 1801 | 17107.0 | 16 | 3 | 169 | 1 | 4118.81 | 1 | 5 | High-Value | 6261.62 | 2 |
| 1802 | 13408.0 | 11 | 3 | 106 | 1 | 5433.89 | 1 | 5 | High-Value | 15076.38 | 2 |
| 1803 | 12921.0 | 5 | 3 | 117 | 1 | 2215.01 | 1 | 5 | High-Value | 11042.90 | 2 |
| 1804 | 13081.0 | 11 | 3 | 261 | 1 | 6808.80 | 1 | 5 | High-Value | 13781.27 | 2 |
| 1805 | 13078.0 | 6 | 3 | 102 | 1 | 2389.45 | 1 | 5 | High-Value | 7262.58 | 2 |
| 1806 | 15513.0 | 7 | 3 | 95 | 1 | 3655.20 | 1 | 5 | High-Value | 8603.26 | 2 |
| 1807 | 14194.0 | 4 | 3 | 178 | 1 | 2825.47 | 1 | 5 | High-Value | 7390.23 | 2 |
| 1808 | 15615.0 | 4 | 3 | 108 | 1 | 2272.76 | 1 | 5 | High-Value | 5652.18 | 2 |
| 1809 | 15856.0 | 11 | 3 | 129 | 1 | 2579.26 | 1 | 5 | High-Value | 6153.20 | 2 |
| 1810 | 16133.0 | 4 | 3 | 86 | 1 | 5837.74 | 1 | 5 | High-Value | 6914.64 | 2 |
| 1811 | 17428.0 | 20 | 2 | 101 | 1 | 4131.15 | 1 | 4 | Mid-Value | 9455.48 | 2 |
| 1812 | 15498.0 | 18 | 2 | 104 | 1 | 2322.17 | 1 | 4 | Mid-Value | 6210.40 | 2 |
| 1813 | 14367.0 | 20 | 2 | 198 | 1 | 2745.36 | 1 | 4 | Mid-Value | 5788.93 | 2 |
| 1814 | 15159.0 | 17 | 2 | 82 | 1 | 2310.40 | 1 | 4 | Mid-Value | 11730.37 | 2 |
| 1815 | 17735.0 | 26 | 2 | 184 | 1 | 3122.99 | 1 | 4 | Mid-Value | 7197.41 | 2 |
| 1816 | 16180.0 | 18 | 2 | 78 | 1 | 2745.43 | 1 | 4 | Mid-Value | 7472.05 | 2 |
| 1817 | 15039.0 | 6 | 3 | 436 | 2 | 6741.59 | 1 | 6 | High-Value | 6893.23 | 2 |
| 1818 | 16333.0 | 0 | 3 | 29 | 0 | 10578.72 | 2 | 5 | High-Value | 11992.40 | 2 |
| 1819 | 16422.0 | 4 | 3 | 129 | 1 | 10348.13 | 2 | 6 | High-Value | 16756.31 | 2 |
| 1820 | 13798.0 | 4 | 3 | 122 | 1 | 11105.20 | 2 | 6 | High-Value | 12913.99 | 2 |
There are a few more steps before we can train the machine learning model:
#convert categorical columns to numerical (one-hot encodes the 'Segment' column)
tx_class = pd.get_dummies(tx_cluster)

#calculate and show correlations of every feature against the label
corr_matrix = tx_class.corr()
corr_matrix['LTVCluster'].sort_values(ascending=False)

#create X and y: X is the feature set, y is the label (LTV cluster).
#m6_Revenue is excluded from X because the label was derived from it.
X = tx_class.drop(columns=['LTVCluster', 'm6_Revenue'])
y = tx_class['LTVCluster']

#split training and test sets (5% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=56)
tx_cluster.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | Segment | m6_Revenue | LTVCluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14620.0 | 12 | 3 | 30 | 0 | 393.28 | 0 | 3 | Mid-Value | 0.00 | 0 |
| 1 | 18044.0 | 5 | 3 | 57 | 0 | 808.96 | 0 | 3 | Mid-Value | 991.54 | 0 |
| 2 | 18075.0 | 12 | 3 | 35 | 0 | 638.12 | 0 | 3 | Mid-Value | 1322.75 | 0 |
| 3 | 15241.0 | 0 | 3 | 64 | 0 | 947.55 | 0 | 3 | Mid-Value | 791.04 | 0 |
| 4 | 15660.0 | 4 | 3 | 34 | 0 | 484.62 | 0 | 3 | Mid-Value | 858.09 | 0 |
This was our dataset before get_dummies(). We have one categorical column which is Segment. What happens after applying get_dummies():
Let’s start with the first line. get_dummies() method converts categorical columns to 0–1 notations. See what it exactly does with the example:
#after get_dummies(): 'Segment' is replaced by three 0/1 indicator columns
tx_class.head()
| CustomerID | Recency | RecencyCluster | Frequency | FrequencyCluster | Revenue | RevenueCluster | OverallScore | m6_Revenue | LTVCluster | Segment_High-Value | Segment_Low-Value | Segment_Mid-Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14620.0 | 12 | 3 | 30 | 0 | 393.28 | 0 | 3 | 0.00 | 0 | 0 | 0 | 1 |
| 1 | 18044.0 | 5 | 3 | 57 | 0 | 808.96 | 0 | 3 | 991.54 | 0 | 0 | 0 | 1 |
| 2 | 18075.0 | 12 | 3 | 35 | 0 | 638.12 | 0 | 3 | 1322.75 | 0 | 0 | 0 | 1 |
| 3 | 15241.0 | 0 | 3 | 64 | 0 | 947.55 | 0 | 3 | 791.04 | 0 | 0 | 0 | 1 |
| 4 | 15660.0 | 4 | 3 | 34 | 0 | 484.62 | 0 | 3 | 858.09 | 0 | 0 | 0 | 1 |
Segment column is gone but we have new numerical ones which represent it. We have converted it to 3 different columns with 0 and 1 and made it usable for our machine learning model.
#correlations of every numeric feature with the LTV cluster label;
#corr() already returns float64, so the redundant astype(float) cast was removed
corr_matrix = tx_class.corr()
corr_matrix["LTVCluster"].sort_values(ascending=False)
LTVCluster 1.000000 m6_Revenue 0.845933 Revenue 0.600491 RevenueCluster 0.463930 OverallScore 0.373231 FrequencyCluster 0.366366 Frequency 0.359601 Segment_High-Value 0.353218 RecencyCluster 0.236899 Segment_Mid-Value 0.166854 CustomerID -0.028401 Recency -0.237249 Segment_Low-Value -0.266008 Name: LTVCluster, dtype: float64
We see that 3 months Revenue, Frequency and RFM scores will be helpful for our machine learning models.
#XGBoost Multiclassification Model: predict each customer's LTV cluster
ltv_xgb_model = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    objective='multi:softprob',
    n_jobs=-1,
).fit(X_train, y_train)

train_accuracy = ltv_xgb_model.score(X_train, y_train)
test_accuracy = ltv_xgb_model.score(X_test[X_train.columns], y_test)
print('Accuracy of XGB classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of XGB classifier on test set: {:.2f}'.format(test_accuracy))

#per-class precision/recall/f1 on the held-out set
y_pred = ltv_xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))
Accuracy of XGB classifier on training set: 0.91
Accuracy of XGB classifier on test set: 0.86
precision recall f1-score support
0 0.89 0.97 0.93 70
1 0.75 0.50 0.60 18
2 0.50 0.50 0.50 4
accuracy 0.86 92
macro avg 0.71 0.66 0.68 92
weighted avg 0.85 0.86 0.85 92
Accuracy shows 86% on the test set.
Precision and recall are acceptable for cluster 0. As an example, for cluster 0 (Low LTV), if the model tells us a customer belongs to cluster 0, 89 out of 100 will be correct (precision). And the model successfully identifies 97% of actual cluster 0 customers (recall). We really need to improve the model for the other clusters. For example, we only detect 50% of Mid LTV customers. Possible actions to improve those points:
Adding more features and improve feature engineering Try different models other than XGBoost Apply hyper parameter tuning to current model Add more data to the model if possible
We definitely do not want to lose customers with high LTV. So we will focus on Churn Prediction